Importing Libraries
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
import os
Importing data
This project uses the Lending Club dataset, from a large US peer-to-peer lending company. There are several different versions of this dataset; we use the updated dataset (version 2), which is available on Kaggle: https://www.kaggle.com/wendykan/lending-club-loan-data/
We divided the data into two periods because we assume that some data are available at the moment we need to build the Expected Loss models, and some data come from applications received afterwards. Later, we investigate whether the applications received after we built the Probability of Default (PD) model have characteristics similar to those of the applications used to build it.
loan = r'F:\Data Analysis\Springboard\Data Science Career Track\Projects\Capstone 2\lending club loan data_version 2\loan.csv'
loan_data_backup = pd.read_csv(loan)
loan_data = loan_data_backup.copy()
Explore Data
loan_data.head()
loan_data.tail()
#Display all columns
#pd.options.display.max_columns = None
#loan_data
#Display all rows
#pd.options.display.max_rows = None
#loan_data
loan_data.columns.values
# Displays column names, complete (non-missing) cases per column, and datatype per column.
loan_data.info()
loan_data.dtypes
loan_data.shape
Pre-processing a few continuous variables: emp_length, earliest_cr_line, term, issue_d
# Display unique values of a column.
loan_data['emp_length'].unique()
The emp_length column has four patterns we have to handle before we can convert it into an integer:
1. '+ years'
2. '< 1 year'
3. 'n/a'
4. ' years' and ' year' (a space followed by 'years' or 'year')
#Converting the employment length from object (string) into integer. We will store the new variable as 'emp_length_int'.
# 1. Remove the string '+ years'. We pass regex=True so '\+' is treated as a regular expression
#    (newer pandas versions default to literal matching in str.replace).
loan_data['emp_length_int'] = loan_data['emp_length'].str.replace(r'\+ years', '', regex=True)
# 2. Replace the whole string '< 1 year' with the string '0'.
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('< 1 year', str(0))
# 3. Replace the 'n/a' string with the string '0'.
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace('n/a', str(0))
# 4. Remove the strings ' years' and ' year' (a space followed by 'years'/'year').
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' years', '')
loan_data['emp_length_int'] = loan_data['emp_length_int'].str.replace(' year', '')
# Checks the datatype of a single element of a column.
type(loan_data['emp_length_int'][0])
Now we transform it into numeric
loan_data['emp_length_int'] = pd.to_numeric(loan_data['emp_length_int'])
type(loan_data['emp_length_int'][0])
Next is earliest credit line
# Next is 'earliest credit line'
#loan_data['earliest_cr_line']
type(loan_data['earliest_cr_line'][0])
# 'earliest credit line' is a date variable. We extract the date from a string variable that is in a given format.
#loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], format= '%b-%Y')
#loan_data['earliest_cr_line_date'] = loan_data['earliest_cr_line'].apply(pd.to_datetime)
loan_data['earliest_cr_line_date'] = pd.to_datetime(loan_data['earliest_cr_line'], infer_datetime_format=True)
type(loan_data['earliest_cr_line_date'][0])
# Calculates the difference between two dates and times.
pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']
# Assume we are now in December 2017. We calculate the difference between two dates in months, turn it into a numeric datatype and round it.
# We save the result in a new variable.
loan_data['mths_since_earliest_cr_line'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['earliest_cr_line_date']) / np.timedelta64(1, 'M')))
# Shows some descriptive statistics for the values of a column.
# Dates from 1969 and earlier have not been converted correctly, i.e. they have become 2069 and similar,
# so negative differences are calculated for them.
loan_data['mths_since_earliest_cr_line'].describe()
###
# We take three columns from the dataframe and display them only for the rows where the months difference is negative.
loan_data.loc[: , ['earliest_cr_line', 'earliest_cr_line_date', 'mths_since_earliest_cr_line']][loan_data['mths_since_earliest_cr_line'] < 0]
###
# We set the rows that had negative differences to the maximum value, using .loc to avoid chained-assignment issues.
loan_data.loc[loan_data['mths_since_earliest_cr_line'] < 0, 'mths_since_earliest_cr_line'] = loan_data['mths_since_earliest_cr_line'].max()
# Calculates and shows the minimum value of a column.
min(loan_data['mths_since_earliest_cr_line'])
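As an aside, here is a minimal sketch of an alternative fix that corrects the century flip itself instead of capping the month differences. It assumes the December 2017 reference date used above and that any parsed 'earliest_cr_line_date' later than that reference is a two-digit-year artifact; the names alt, future_mask and alt_months are hypothetical, and the sketch works on a copy so the pipeline above is unaffected.
# Alternative (sketch only): roll century-flipped dates back by 100 years and recompute the month differences.
alt = loan_data[['earliest_cr_line_date']].copy()
future_mask = alt['earliest_cr_line_date'] > pd.to_datetime('2017-12-01')
alt.loc[future_mask, 'earliest_cr_line_date'] = alt.loc[future_mask, 'earliest_cr_line_date'] - pd.DateOffset(years=100)
alt_months = round(pd.to_numeric((pd.to_datetime('2017-12-01') - alt['earliest_cr_line_date']) / np.timedelta64(1, 'M')))
alt_months.describe()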
The next variable to pre-process is term
loan_data['term']
# Shows some descriptive statistics for the values of a column.
loan_data['term'].describe()
# We replace a string with another string, in this case, with an empty string (i.e. with nothing).
loan_data['term_int'] = loan_data['term'].str.replace(' months', '')
type(loan_data['term_int'][0])
# We replace a string in the variable with another string, in this case, with an empty string (i.e. with nothing).
# We turn the result into a numeric datatype and save it in another variable.
loan_data['term_int'] = pd.to_numeric(loan_data['term'].str.replace(' months', ''))
# Checks the datatype of a single element of a column.
type(loan_data['term_int'][0])
The next variable to pre-process is issue_d
loan_data['issue_d']
# Assume we are now in December 2017
# Extracts the date and the time from a string variable that is in a given format.
loan_data['issue_d_date'] = pd.to_datetime(loan_data['issue_d'], infer_datetime_format=True)
# We calculate the difference between two dates in months, turn it to numeric datatype and round it.
# We save the result in a new variable.
loan_data['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2017-12-01') - loan_data['issue_d_date']) / np.timedelta64(1, 'M')))
# Shows some descriptive statistics for the values of a column.
loan_data['mths_since_issue_d'].describe()
Preprocessing a few discrete/categorical variables
Variables: grade, sub_grade, home_ownership, purpose, addr_state, initial_list_status. We are not going to use sub_grade, as it overlaps with grade.
# Displays column names, complete (non-missing) cases per column, and datatype per column.
loan_data.info()
loan_data.dtypes
# We create dummy variables from 8 of the original categorical variables and save them into a list.
loan_data_dummies = [pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':'),
pd.get_dummies(loan_data['sub_grade'], prefix = 'sub_grade', prefix_sep = ':'),
pd.get_dummies(loan_data['home_ownership'], prefix = 'home_ownership', prefix_sep = ':'),
pd.get_dummies(loan_data['verification_status'], prefix = 'verification_status', prefix_sep = ':'),
pd.get_dummies(loan_data['loan_status'], prefix = 'loan_status', prefix_sep = ':'),
pd.get_dummies(loan_data['purpose'], prefix = 'purpose', prefix_sep = ':'),
pd.get_dummies(loan_data['addr_state'], prefix = 'addr_state', prefix_sep = ':'),
pd.get_dummies(loan_data['initial_list_status'], prefix = 'initial_list_status', prefix_sep = ':')]
# We concatenate the dummy variables and this turns them into a dataframe.
loan_data_dummies = pd.concat(loan_data_dummies, axis = 1)
type(loan_data_dummies)
# We concatenate the original loan_data with the dataframe with dummy variables, along the columns.
loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1)
# Displays all column names.
loan_data.columns.values
Checking and handling missing and NA values
We will be using these variables in our analysis. Let's check for their missing values and fill them: annual_inc, delinq_2yrs, inq_last_6mths, open_acc, pub_rec, total_acc, acc_now_delinq, total_rev_hi_lim, emp_length_int, and mths_since_earliest_cr_line
# It returns 'False' if a value is not missing and 'True' if a value is missing, for each value in a dataframe.
loan_data.isnull()
# Sets the pandas dataframe options to display all columns/ rows.
#pd.options.display.max_rows = None
#loan_data.isnull().sum()
# Sets the pandas dataframe options to display all columns/ rows.
#pd.options.display.max_columns = None
#loan_data.isnull().sum()
One way to deal with missing values is to remove all observations (rows) where a value is missing. Another way is to impute them.
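Here is a minimal sketch of both strategies on a toy dataframe (the dataframe toy and its column 'x' are made up purely for illustration):
# Option 1: drop rows with missing values (loses observations).
# Option 2: impute, e.g. with the mean of the non-missing values (keeps observations but may distort the distribution).
toy = pd.DataFrame({'x': [1.0, np.nan, 3.0, np.nan, 5.0]})
dropped = toy.dropna(subset=['x'])
imputed = toy['x'].fillna(toy['x'].mean())
print(dropped.shape, imputed.isnull().sum())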
Let's start with total_rev_hi_lim
#'Total revolving high credit/ credit limit' will most likely be equal to 'funded_amnt'.
#So we replace its missing values with the values of 'funded_amnt'.
loan_data['total_rev_hi_lim'].fillna(loan_data['funded_amnt'], inplace=True)
#Checking to see if there are still any missing values
loan_data['total_rev_hi_lim'].isnull().sum()
Next, we handle annual_inc
# We will fill the missing values with the mean value of the non-missing values for 'annual_inc'.
loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(), inplace=True)
loan_data['annual_inc'].isnull().sum()
For the others, we will fill them with zeros
# We fill the missing values with zeroes.
loan_data['mths_since_earliest_cr_line'].fillna(0, inplace=True)
loan_data['acc_now_delinq'].fillna(0, inplace=True)
loan_data['total_acc'].fillna(0, inplace=True)
loan_data['pub_rec'].fillna(0, inplace=True)
loan_data['open_acc'].fillna(0, inplace=True)
loan_data['inq_last_6mths'].fillna(0, inplace=True)
loan_data['delinq_2yrs'].fillna(0, inplace=True)
loan_data['emp_length_int'].fillna(0, inplace=True)
Data Preparations
The dependent variable will be a Good/Bad (default) indicator. The definition used here is that an account is considered to be in default (a bad loan) if the borrower is 90 or more days past due on the loan, or if the borrower commits fraud. The loan_status variable is used to determine whether a customer has defaulted.
# Displays unique values of loan_status column
loan_data['loan_status'].unique()
# Calculates the number of observations for each unique value of a variable
loan_data['loan_status'].value_counts()
#Total loans issued
loan_data['loan_status'].count()
#Proportion of loans in each status
loan_data['loan_status'].value_counts() / loan_data['loan_status'].count()
# Good/Bad loan. We create a new variable that has the value '0' if a condition is met (default), and the value '1' if it is not met (non-default).
loan_data['good_bad'] = np.where(loan_data['loan_status'].isin(['Charged Off', 'Default',
'Does not meet the credit policy. Status:Charged Off',
'Late (31-120 days)']), 0, 1)
loan_data['good_bad']
# We split the inputs and targets dataframes, each into a train and a test dataframe, and store them in variables.
# We set the size of the test dataset to 20%, so the train dataset becomes 80%.
# We also set a specific random state. This allows us to perform the exact same split multiple times,
# i.e. to assign the exact same observations to the train and test datasets.
loan_data_inputs_train, loan_data_inputs_test, loan_data_targets_train, loan_data_targets_test = train_test_split(loan_data.drop('good_bad', axis = 1), loan_data['good_bad'], test_size = 0.2, random_state = 42)
# Displaying the size of the dataframes
print(loan_data_inputs_train.shape)
print(loan_data_targets_train.shape)
print(loan_data_inputs_test.shape)
print(loan_data_targets_test.shape)
Preprocessing More Discrete/ Categorical Variables
Creating variables
n_obs is the total number of observations
WoE is Weight of Evidence
n_good is the number of good loans
n_bad is the number of bad loans
prop_good is the proportion of good borrowers
prop_bad is the proportion of bad borrowers
prop_n_obs is the proportion of observations
prop_n_good is the proportion of the number of good borrowers
prop_n_bad is the proportion of the number of bad borrowers
IV is Information Value
diff_prop_good is the difference of the proportion of good borrowers
df_inputs_prepr = loan_data_inputs_train
df_targets_prepr = loan_data_targets_train
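Concretely, for each category i of a variable, the WoE function below computes
$$\mathrm{WoE}_i = \ln\!\left(\frac{\mathrm{prop\_n\_good}_i}{\mathrm{prop\_n\_bad}_i}\right), \qquad \mathrm{IV} = \sum_i \left(\mathrm{prop\_n\_good}_i - \mathrm{prop\_n\_bad}_i\right)\mathrm{WoE}_i$$
where prop_n_good_i and prop_n_bad_i are category i's shares of all good and all bad borrowers, respectively.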
# The function takes 3 arguments: a dataframe, a string, and a dataframe. The function returns a dataframe as a result.
# WoE function for discrete unordered variables
def woe_discrete(df, discrete_variabe_name, good_bad_variable_df):
df = pd.concat([df[discrete_variabe_name], good_bad_variable_df], axis = 1)
df = pd.concat([df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count(),
df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].mean()], axis = 1)
df = df.iloc[:, [0, 1, 3]]
df.columns = [df.columns.values[0], 'n_obs', 'prop_good']
df['prop_n_obs'] = df['n_obs'] / df['n_obs'].sum()
df['n_good'] = df['prop_good'] * df['n_obs']
df['n_bad'] = (1 - df['prop_good']) * df['n_obs']
df['prop_n_good'] = df['n_good'] / df['n_good'].sum()
df['prop_n_bad'] = df['n_bad'] / df['n_bad'].sum()
df['WoE'] = np.log(df['prop_n_good'] / df['prop_n_bad'])
df = df.sort_values(['WoE'])
df = df.reset_index(drop = True)
df['diff_prop_good'] = df['prop_good'].diff().abs()
df['diff_WoE'] = df['WoE'].diff().abs()
df['IV'] = (df['prop_n_good'] - df['prop_n_bad']) * df['WoE']
df['IV'] = df['IV'].sum()
return df
#grade variable
#Executing the function and storing it in a dataframe
df_temp = woe_discrete(df_inputs_prepr, 'grade', df_targets_prepr)
df_temp
The Information Value (IV) of grade falls between 0.3 and 0.5, which by the usual rule of thumb indicates strong predictive power.
Visualizing the Preprocessed variables
#We define a function that takes 2 arguments: a dataframe and a number.
def plot_by_woe(df_WoE, rotation_of_x_axis_labels = 0):
# Turns the values of the column with index 0 to strings, makes an array from these strings, and passes it to variable x.
x = np.array(df_WoE.iloc[:, 0].apply(str))
# Selects a column with label 'WoE' and passes it to variable y.
y = df_WoE['WoE']
#Plotting the figure
plt.figure(figsize=(18, 6))
plt.plot(x, y, marker = 'o', linestyle = '--', color = 'k')
plt.xlabel(df_WoE.columns[0])
plt.ylabel('Weight of Evidence')
plt.title(str('Weight of Evidence by ' + df_WoE.columns[0]))
plt.xticks(rotation = rotation_of_x_axis_labels)
plot_by_woe(df_temp)
The better the grade, the higher the Weight of Evidence: loans with better external ratings are, on average, less risky.
Preprocessing Discrete Variables: Creating Dummy Variables
#home_ownership variable
#Executing our previous WOE function
df_temp1 = woe_discrete(df_inputs_prepr, 'home_ownership', df_targets_prepr)
df_temp1
#Plotting the weight of evidence (WoE) values by executing the plot function we created previously.
plot_by_woe(df_temp1)
# The home_ownership variable has many categories.
# Therefore, we create a new discrete variable in which we combine some of them.
# 'OTHER', 'ANY' and 'NONE' are the riskiest but are very few. 'RENT' is the next riskiest.
# We combine them into one category, 'RENT_OTHER_NONE_ANY'.
# We end up with 3 categories for 'home_ownership': 'RENT_OTHER_NONE_ANY', 'OWN', 'MORTGAGE'.
df_inputs_prepr['home_ownership:RENT_OTHER_NONE_ANY'] = sum([df_inputs_prepr['home_ownership:RENT'], df_inputs_prepr['home_ownership:OTHER'],
df_inputs_prepr['home_ownership:NONE'],df_inputs_prepr['home_ownership:ANY']])
#Unique categories in the addr_state variable
df_inputs_prepr['addr_state'].unique()
#addr_state variable
# We calculate weight of evidence.
df_temp2 = woe_discrete(df_inputs_prepr, 'addr_state', df_targets_prepr)
df_temp2
# We plot the weight of evidence values.
plot_by_woe(df_temp2)
# If no loans from North Dakota appear in the data, get_dummies will not have created an 'addr_state:ND' column,
# so we add one filled with zeros to keep the set of state dummies complete.
if 'addr_state:ND' in df_inputs_prepr.columns.values:
    pass
else:
    df_inputs_prepr['addr_state:ND'] = 0
# We plot the weight of evidence values for 'addr_state' again, dropping the two states with the most extreme WoE at each end (e.g. 'IA' and 'AL').
plot_by_woe(df_temp2.iloc[2: -2, : ])
# We plot the weight of evidence values again, dropping the six most extreme states at each end (e.g. 'OR', 'NH', 'DC', 'ID', 'VT', 'ME' at the upper end).
plot_by_woe(df_temp2.iloc[6: -6, : ])
# Creating dummies for the 'addr_state' variable
# We create the following categories:
# 'ND' 'NE' 'IA' NV' 'FL' 'HI' 'AL'
# 'NM' 'VA'
# 'NY'
# 'OK' 'TN' 'MO' 'LA' 'MD' 'NC'
# 'CA'
# 'UT' 'KY' 'AZ' 'NJ'
# 'AR' 'MI' 'PA' 'OH' 'MN'
# 'RI' 'MA' 'DE' 'SD' 'IN'
# 'GA' 'WA' 'OR'
# 'WI' 'MT'
# 'TX'
# 'IL' 'CT'
# 'KS' 'SC' 'CO' 'VT' 'AK' 'MS'
# 'WV' 'NH' 'WY' 'DC' 'ME' 'ID'
# 'ND_NE_IA_NV_FL_HI_AL' will be the reference category.
df_inputs_prepr['addr_state:ND_NE_IA_NV_FL_HI_AL'] = sum([df_inputs_prepr['addr_state:ND'], df_inputs_prepr['addr_state:NE'],
df_inputs_prepr['addr_state:IA'], df_inputs_prepr['addr_state:NV'],
df_inputs_prepr['addr_state:FL'], df_inputs_prepr['addr_state:HI'],
df_inputs_prepr['addr_state:AL']])
df_inputs_prepr['addr_state:NM_VA'] = sum([df_inputs_prepr['addr_state:NM'], df_inputs_prepr['addr_state:VA']])
df_inputs_prepr['addr_state:OK_TN_MO_LA_MD_NC'] = sum([df_inputs_prepr['addr_state:OK'], df_inputs_prepr['addr_state:TN'],
df_inputs_prepr['addr_state:MO'], df_inputs_prepr['addr_state:LA'],
df_inputs_prepr['addr_state:MD'], df_inputs_prepr['addr_state:NC']])
df_inputs_prepr['addr_state:UT_KY_AZ_NJ'] = sum([df_inputs_prepr['addr_state:UT'], df_inputs_prepr['addr_state:KY'],
df_inputs_prepr['addr_state:AZ'], df_inputs_prepr['addr_state:NJ']])
df_inputs_prepr['addr_state:AR_MI_PA_OH_MN'] = sum([df_inputs_prepr['addr_state:AR'], df_inputs_prepr['addr_state:MI'],
df_inputs_prepr['addr_state:PA'], df_inputs_prepr['addr_state:OH'],
df_inputs_prepr['addr_state:MN']])
df_inputs_prepr['addr_state:RI_MA_DE_SD_IN'] = sum([df_inputs_prepr['addr_state:RI'], df_inputs_prepr['addr_state:MA'],
df_inputs_prepr['addr_state:DE'], df_inputs_prepr['addr_state:SD'],
df_inputs_prepr['addr_state:IN']])
df_inputs_prepr['addr_state:GA_WA_OR'] = sum([df_inputs_prepr['addr_state:GA'], df_inputs_prepr['addr_state:WA'],
df_inputs_prepr['addr_state:OR']])
df_inputs_prepr['addr_state:WI_MT'] = sum([df_inputs_prepr['addr_state:WI'], df_inputs_prepr['addr_state:MT']])
df_inputs_prepr['addr_state:IL_CT'] = sum([df_inputs_prepr['addr_state:IL'], df_inputs_prepr['addr_state:CT']])
df_inputs_prepr['addr_state:KS_SC_CO_VT_AK_MS'] = sum([df_inputs_prepr['addr_state:KS'], df_inputs_prepr['addr_state:SC'],
df_inputs_prepr['addr_state:CO'], df_inputs_prepr['addr_state:VT'],
df_inputs_prepr['addr_state:AK'], df_inputs_prepr['addr_state:MS']])
df_inputs_prepr['addr_state:WV_NH_WY_DC_ME_ID'] = sum([df_inputs_prepr['addr_state:WV'], df_inputs_prepr['addr_state:NH'],
df_inputs_prepr['addr_state:WY'], df_inputs_prepr['addr_state:DC'],
df_inputs_prepr['addr_state:ME'], df_inputs_prepr['addr_state:ID']])
# 'verification_status' variable
# We calculate weight of evidence.
df_temp3 = woe_discrete(df_inputs_prepr, 'verification_status', df_targets_prepr)
df_temp3
# We plot the weight of evidence values.
plot_by_woe(df_temp3)
# 'purpose' variable
# We calculate weight of evidence.
df_temp4 = woe_discrete(df_inputs_prepr, 'purpose', df_targets_prepr)
df_temp4
plot_by_woe(df_temp4, 90)
# We plot the weight of evidence values.
# We create dummy variables for the 'purpose' variable
# We combine 'educational', 'small_business', 'wedding', 'renewable_energy', 'moving', 'house' in one category: 'educ__sm_b__wedd__ren_en__mov__house'.
# We combine 'other', 'medical', 'vacation' in one category: 'oth__med__vacation'.
# We combine 'major_purchase', 'car', 'home_improvement' in one category: 'major_purch__car__home_impr'.
# We leave 'debt_consolidation' in a separate category.
# We leave 'credit_card' in a separate category.
# 'educ__sm_b__wedd__ren_en__mov__house' will be the reference category.
df_inputs_prepr['purpose:educ__sm_b__wedd__ren_en__mov__house'] = sum([df_inputs_prepr['purpose:educational'], df_inputs_prepr['purpose:small_business'],
df_inputs_prepr['purpose:wedding'], df_inputs_prepr['purpose:renewable_energy'],
df_inputs_prepr['purpose:moving'], df_inputs_prepr['purpose:house']])
df_inputs_prepr['purpose:oth__med__vacation'] = sum([df_inputs_prepr['purpose:other'], df_inputs_prepr['purpose:medical'],
df_inputs_prepr['purpose:vacation']])
df_inputs_prepr['purpose:major_purch__car__home_impr'] = sum([df_inputs_prepr['purpose:major_purchase'], df_inputs_prepr['purpose:car'],
df_inputs_prepr['purpose:home_improvement']])
# 'initial_list_status' variable
# We calculate weight of evidence.
df_temp5 = woe_discrete(df_inputs_prepr, 'initial_list_status', df_targets_prepr)
df_temp5
# We plot the weight of evidence values.
plot_by_woe(df_temp5)
Preprocessing Continuous Variables: Creating dummy variables
# WoE function for ordered discrete and continuous variables
# The function takes 3 arguments: a dataframe, a string, and a dataframe. The function returns a dataframe as a result.
def woe_ordered_continuous(df, discrete_variabe_name, good_bad_variable_df):
df = pd.concat([df[discrete_variabe_name], good_bad_variable_df], axis = 1)
df = pd.concat([df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count(),
df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].mean()], axis = 1)
df = df.iloc[:, [0, 1, 3]]
df.columns = [df.columns.values[0], 'n_obs', 'prop_good']
df['prop_n_obs'] = df['n_obs'] / df['n_obs'].sum()
df['n_good'] = df['prop_good'] * df['n_obs']
df['n_bad'] = (1 - df['prop_good']) * df['n_obs']
df['prop_n_good'] = df['n_good'] / df['n_good'].sum()
df['prop_n_bad'] = df['n_bad'] / df['n_bad'].sum()
df['WoE'] = np.log(df['prop_n_good'] / df['prop_n_bad'])
#df = df.sort_values(['WoE'])
#df = df.reset_index(drop = True)
df['diff_prop_good'] = df['prop_good'].diff().abs()
df['diff_WoE'] = df['WoE'].diff().abs()
df['IV'] = (df['prop_n_good'] - df['prop_n_bad']) * df['WoE']
df['IV'] = df['IV'].sum()
return df
# term variable
# There are only two unique values, 36 and 60.
df_inputs_prepr['term_int'].unique()
# We calculate weight of evidence.
df_temp6 = woe_ordered_continuous(df_inputs_prepr, 'term_int', df_targets_prepr)
df_temp6
# We plot the weight of evidence values.
plot_by_woe(df_temp6)
It seems 60-month loans are much riskier than 36-month loans.
# We will keep both the 36- and 60-month categories.
# However, the '60' months category will be the reference category.
df_inputs_prepr['term:36'] = np.where((df_inputs_prepr['term_int'] == 36), 1, 0)
df_inputs_prepr['term:60'] = np.where((df_inputs_prepr['term_int'] == 60), 1, 0)
# emp_length_int variable
# Has only 11 levels: from 0 to 10. Hence, we turn it into a factor with 11 levels.
df_inputs_prepr['emp_length_int'].unique()
# We calculate weight of evidence.
df_temp7 = woe_ordered_continuous(df_inputs_prepr, 'emp_length_int', df_targets_prepr)
df_temp7
# We plot the weight of evidence values.
plot_by_woe(df_temp7)
# Employment length has several categories
# So we have to create the following new categories for emp_length_int: '0', '1', '2 - 4', '5 - 6', '7 - 9', '10'
# '0' will be the reference category
df_inputs_prepr['emp_length:0'] = np.where(df_inputs_prepr['emp_length_int'].isin([0]), 1, 0)
df_inputs_prepr['emp_length:1'] = np.where(df_inputs_prepr['emp_length_int'].isin([1]), 1, 0)
df_inputs_prepr['emp_length:2-4'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(2, 5)), 1, 0)
df_inputs_prepr['emp_length:5-6'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(5, 7)), 1, 0)
df_inputs_prepr['emp_length:7-9'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(7, 10)), 1, 0)
df_inputs_prepr['emp_length:10'] = np.where(df_inputs_prepr['emp_length_int'].isin([10]), 1, 0)
# Months since loan issue date (mths_since_issue_d) variable
df_inputs_prepr['mths_since_issue_d'].unique()
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr['mths_since_issue_d_factor'] = pd.cut(df_inputs_prepr['mths_since_issue_d'], 50)
df_inputs_prepr['mths_since_issue_d_factor']
# mths_since_issue_d
# We calculate weight of evidence.
df_temp8 = woe_ordered_continuous(df_inputs_prepr, 'mths_since_issue_d_factor', df_targets_prepr)
df_temp8
# We plot the weight of evidence values.
plot_by_woe(df_temp8)
We have to rotate the labels because we cannot read them otherwise.
# We plot the weight of evidence values, rotating the labels 90 degrees.
plot_by_woe(df_temp8, 90)
# We plot the weight of evidence values.
plot_by_woe(df_temp8.iloc[3: , : ], 90)
# We create the following categories:
# < 38, 38 - 39, 40 - 41, 42 - 48, 49 - 52, 53 - 64, 65 - 84, > 84.
df_inputs_prepr['mths_since_issue_d:<38'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(38)), 1, 0)
df_inputs_prepr['mths_since_issue_d:38-39'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(38, 40)), 1, 0)
df_inputs_prepr['mths_since_issue_d:40-41'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(40, 42)), 1, 0)
df_inputs_prepr['mths_since_issue_d:42-48'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(42, 49)), 1, 0)
df_inputs_prepr['mths_since_issue_d:49-52'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(49, 53)), 1, 0)
df_inputs_prepr['mths_since_issue_d:53-64'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(53, 65)), 1, 0)
df_inputs_prepr['mths_since_issue_d:65-84'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(65, 85)), 1, 0)
df_inputs_prepr['mths_since_issue_d:>84'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(85, int(df_inputs_prepr['mths_since_issue_d'].max()) + 1)), 1, 0)
# int_rate variable
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr['int_rate_factor'] = pd.cut(df_inputs_prepr['int_rate'], 50)
# We calculate weight of evidence.
df_temp9 = woe_ordered_continuous(df_inputs_prepr, 'int_rate_factor', df_targets_prepr)
df_temp9
# We plot the weight of evidence values.
plot_by_woe(df_temp9, 90)
# We create the following categories:
# '< 9.548', '9.548 - 12.025', '12.025 - 15.74', '15.74 - 20.281', '> 20.281'
df_inputs_prepr['int_rate:<9.548'] = np.where((df_inputs_prepr['int_rate'] <= 9.548), 1, 0)
df_inputs_prepr['int_rate:9.548-12.025'] = np.where((df_inputs_prepr['int_rate'] > 9.548) & (df_inputs_prepr['int_rate'] <= 12.025), 1, 0)
df_inputs_prepr['int_rate:12.025-15.74'] = np.where((df_inputs_prepr['int_rate'] > 12.025) & (df_inputs_prepr['int_rate'] <= 15.74), 1, 0)
df_inputs_prepr['int_rate:15.74-20.281'] = np.where((df_inputs_prepr['int_rate'] > 15.74) & (df_inputs_prepr['int_rate'] <= 20.281), 1, 0)
df_inputs_prepr['int_rate:>20.281'] = np.where((df_inputs_prepr['int_rate'] > 20.281), 1, 0)
# funded_amnt variable
df_inputs_prepr['funded_amnt'].unique()
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr['funded_amnt_factor'] = pd.cut(df_inputs_prepr['funded_amnt'], 50)
# We calculate weight of evidence.
df_temp10 = woe_ordered_continuous(df_inputs_prepr, 'funded_amnt_factor', df_targets_prepr)
df_temp10
# We plot the weight of evidence values.
plot_by_woe(df_temp10, 90)
# mths_since_earliest_cr_line variable
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr['mths_since_earliest_cr_line_factor'] = pd.cut(df_inputs_prepr['mths_since_earliest_cr_line'], 50)
# We calculate weight of evidence.
df_temp11 = woe_ordered_continuous(df_inputs_prepr, 'mths_since_earliest_cr_line_factor', df_targets_prepr)
df_temp11
# We plot the weight of evidence values.
plot_by_woe(df_temp11, 90)
# We plot the weight of evidence values
plot_by_woe(df_temp11.iloc[6: , : ], 90)
# We create the following categories:
# < 140, 140 - 164, 165 - 247, 248 - 270, 271 - 352, > 352.
df_inputs_prepr['mths_since_earliest_cr_line:<140'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(140)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:141-164'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(140, 165)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:165-247'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(165, 248)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:248-270'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(248, 271)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:271-352'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(271, 353)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:>352'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(353, int(df_inputs_prepr['mths_since_earliest_cr_line'].max()) + 1)), 1, 0)
# delinq_2yrs variable
# We calculate weight of evidence
df_temp12 = woe_ordered_continuous(df_inputs_prepr, 'delinq_2yrs', df_targets_prepr)
df_temp12
# We plot the weight of evidence values
plot_by_woe(df_temp12)
# We create the following Categories: 0, 1-3, >=4
df_inputs_prepr['delinq_2yrs:0'] = np.where((df_inputs_prepr['delinq_2yrs'] == 0), 1, 0)
df_inputs_prepr['delinq_2yrs:1-3'] = np.where((df_inputs_prepr['delinq_2yrs'] >= 1) & (df_inputs_prepr['delinq_2yrs'] <= 3), 1, 0)
df_inputs_prepr['delinq_2yrs:>=4'] = np.where((df_inputs_prepr['delinq_2yrs'] >= 4), 1, 0)
# inq_last_6mths variable
# We calculate weight of evidence.
df_temp13 = woe_ordered_continuous(df_inputs_prepr, 'inq_last_6mths', df_targets_prepr)
df_temp13
# We plot the weight of evidence values
plot_by_woe(df_temp13)
# We create the following Categories: 0, 1 - 2, 3 - 6, > 6
df_inputs_prepr['inq_last_6mths:0'] = np.where((df_inputs_prepr['inq_last_6mths'] == 0), 1, 0)
df_inputs_prepr['inq_last_6mths:1-2'] = np.where((df_inputs_prepr['inq_last_6mths'] >= 1) & (df_inputs_prepr['inq_last_6mths'] <= 2), 1, 0)
df_inputs_prepr['inq_last_6mths:3-6'] = np.where((df_inputs_prepr['inq_last_6mths'] >= 3) & (df_inputs_prepr['inq_last_6mths'] <= 6), 1, 0)
df_inputs_prepr['inq_last_6mths:>6'] = np.where((df_inputs_prepr['inq_last_6mths'] > 6), 1, 0)
# open_acc variable
# We calculate weight of evidence.
df_temp14 = woe_ordered_continuous(df_inputs_prepr, 'open_acc', df_targets_prepr)
df_temp14
# We plot the weight of evidence values
plot_by_woe(df_temp14, 90)
# We plot the weight of evidence values
plot_by_woe(df_temp14.iloc[ : 40, :], 90)
# We create the following Categories: '0', '1-3', '4-12', '13-17', '18-22', '23-25', '26-30', '>30'
df_inputs_prepr['open_acc:0'] = np.where((df_inputs_prepr['open_acc'] == 0), 1, 0)
df_inputs_prepr['open_acc:1-3'] = np.where((df_inputs_prepr['open_acc'] >= 1) & (df_inputs_prepr['open_acc'] <= 3), 1, 0)
df_inputs_prepr['open_acc:4-12'] = np.where((df_inputs_prepr['open_acc'] >= 4) & (df_inputs_prepr['open_acc'] <= 12), 1, 0)
df_inputs_prepr['open_acc:13-17'] = np.where((df_inputs_prepr['open_acc'] >= 13) & (df_inputs_prepr['open_acc'] <= 17), 1, 0)
df_inputs_prepr['open_acc:18-22'] = np.where((df_inputs_prepr['open_acc'] >= 18) & (df_inputs_prepr['open_acc'] <= 22), 1, 0)
df_inputs_prepr['open_acc:23-25'] = np.where((df_inputs_prepr['open_acc'] >= 23) & (df_inputs_prepr['open_acc'] <= 25), 1, 0)
df_inputs_prepr['open_acc:26-30'] = np.where((df_inputs_prepr['open_acc'] >= 26) & (df_inputs_prepr['open_acc'] <= 30), 1, 0)
df_inputs_prepr['open_acc:>=31'] = np.where((df_inputs_prepr['open_acc'] >= 31), 1, 0)
# pub_rec variable
# We calculate weight of evidence
df_temp15 = woe_ordered_continuous(df_inputs_prepr, 'pub_rec', df_targets_prepr)
df_temp15
# We plot the weight of evidence values.
plot_by_woe(df_temp15, 90)
# We create the following Categories: '0-2', '3-4', '>=5'
df_inputs_prepr['pub_rec:0-2'] = np.where((df_inputs_prepr['pub_rec'] >= 0) & (df_inputs_prepr['pub_rec'] <= 2), 1, 0)
df_inputs_prepr['pub_rec:3-4'] = np.where((df_inputs_prepr['pub_rec'] >= 3) & (df_inputs_prepr['pub_rec'] <= 4), 1, 0)
df_inputs_prepr['pub_rec:>=5'] = np.where((df_inputs_prepr['pub_rec'] >= 5), 1, 0)
# total_acc variable
df_inputs_prepr['total_acc'].unique()
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr['total_acc_factor'] = pd.cut(df_inputs_prepr['total_acc'], 50)
# We calculate weight of evidence
df_temp16 = woe_ordered_continuous(df_inputs_prepr, 'total_acc_factor', df_targets_prepr)
df_temp16
# We plot the weight of evidence values
plot_by_woe(df_temp16, 90)
# We create the following Categories: '<=27', '28-51', '>51'
df_inputs_prepr['total_acc:<=27'] = np.where((df_inputs_prepr['total_acc'] <= 27), 1, 0)
df_inputs_prepr['total_acc:28-51'] = np.where((df_inputs_prepr['total_acc'] >= 28) & (df_inputs_prepr['total_acc'] <= 51), 1, 0)
df_inputs_prepr['total_acc:>=52'] = np.where((df_inputs_prepr['total_acc'] >= 52), 1, 0)
# acc_now_delinq variable
# We calculate weight of evidence.
df_temp17 = woe_ordered_continuous(df_inputs_prepr, 'acc_now_delinq', df_targets_prepr)
df_temp17
# We plot the weight of evidence values
plot_by_woe(df_temp17)
# We create the following Categories: '0', '>=1'
df_inputs_prepr['acc_now_delinq:0'] = np.where((df_inputs_prepr['acc_now_delinq'] == 0), 1, 0)
df_inputs_prepr['acc_now_delinq:>=1'] = np.where((df_inputs_prepr['acc_now_delinq'] >= 1), 1, 0)
# total_rev_hi_lim variable
df_inputs_prepr['total_rev_hi_lim'].unique()
# Here we do fine-classing: using the 'cut' method, we split the variable into 2000 categories by its values.
df_inputs_prepr['total_rev_hi_lim_factor'] = pd.cut(df_inputs_prepr['total_rev_hi_lim'], 2000)
# We calculate weight of evidence.
df_temp18 = woe_ordered_continuous(df_inputs_prepr, 'total_rev_hi_lim_factor', df_targets_prepr)
df_temp18
# We plot the weight of evidence values.
plot_by_woe(df_temp18.iloc[: 50, : ], 90)
# We create the following Categories:'<=5K', '5K-10K', '10K-20K', '20K-30K', '30K-40K', '40K-55K', '55K-95K', '>95K'
df_inputs_prepr['total_rev_hi_lim:<=5K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] <= 5000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:5K-10K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 5000) & (df_inputs_prepr['total_rev_hi_lim'] <= 10000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:10K-20K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 10000) & (df_inputs_prepr['total_rev_hi_lim'] <= 20000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:20K-30K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 20000) & (df_inputs_prepr['total_rev_hi_lim'] <= 30000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:30K-40K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 30000) & (df_inputs_prepr['total_rev_hi_lim'] <= 40000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:40K-55K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 40000) & (df_inputs_prepr['total_rev_hi_lim'] <= 55000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:55K-95K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 55000) & (df_inputs_prepr['total_rev_hi_lim'] <= 95000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:>95K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 95000), 1, 0)
# installment variable
df_inputs_prepr['installment'].unique()
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr['installment_factor'] = pd.cut(df_inputs_prepr['installment'], 50)
# We calculate weight of evidence.
df_temp19 = woe_ordered_continuous(df_inputs_prepr, 'installment_factor', df_targets_prepr)
df_temp19
# We plot the weight of evidence values.
plot_by_woe(df_temp19, 90)
# annual_inc variable
df_inputs_prepr['annual_inc'].unique()
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 50)
# We calculate weight of evidence.
df_temp20 = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_targets_prepr)
df_temp20
# Splitting the initial 'annual income' variable into 50 categories doesn't work well for fine-classing, because there are a lot of people with low income and very few people with high income.
# Thus, we do fine-classing with the 'cut' method again, this time splitting the variable into 100 categories by its values.
df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 100)
# We calculate weight of evidence.
df_temp20 = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_targets_prepr)
df_temp20
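As an aside, a minimal sketch of quantile-based fine-classing is shown below: pd.qcut puts roughly equal numbers of observations in each bin, which can work better for a skewed variable like income. The column name 'annual_inc_factor_q', the dataframe df_temp_q and the choice of 50 bins are illustrative assumptions, not part of the approach followed here.
# Equal-frequency bins instead of equal-width bins; duplicates='drop' merges bins whose edges coincide.
df_inputs_prepr['annual_inc_factor_q'] = pd.qcut(df_inputs_prepr['annual_inc'], 50, duplicates='drop')
df_temp_q = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor_q', df_targets_prepr)
plot_by_woe(df_temp_q, 90)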
# Initial examination shows that there are too few individuals with large incomes and too many with small incomes.
# Hence, we are going to have one category for incomes above 140K, and we are going to apply our approach to determine
# the categories for everyone with 140K or less.
df_inputs_prepr_temp = df_inputs_prepr.loc[df_inputs_prepr['annual_inc'] <= 140000, : ]
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr_temp["annual_inc_factor"] = pd.cut(df_inputs_prepr_temp['annual_inc'], 50)
# We calculate weight of evidence.
df_temp20 = woe_ordered_continuous(df_inputs_prepr_temp, 'annual_inc_factor', df_targets_prepr[df_inputs_prepr_temp.index])
df_temp20
# We plot the weight of evidence values.
plot_by_woe(df_temp20, 90)
# WoE changes roughly monotonically with income, so we coarse-classify income into the following bands (narrower at the low end, where most borrowers are).
df_inputs_prepr['annual_inc:<20K'] = np.where((df_inputs_prepr['annual_inc'] <= 20000), 1, 0)
df_inputs_prepr['annual_inc:20K-30K'] = np.where((df_inputs_prepr['annual_inc'] > 20000) & (df_inputs_prepr['annual_inc'] <= 30000), 1, 0)
df_inputs_prepr['annual_inc:30K-40K'] = np.where((df_inputs_prepr['annual_inc'] > 30000) & (df_inputs_prepr['annual_inc'] <= 40000), 1, 0)
df_inputs_prepr['annual_inc:40K-50K'] = np.where((df_inputs_prepr['annual_inc'] > 40000) & (df_inputs_prepr['annual_inc'] <= 50000), 1, 0)
df_inputs_prepr['annual_inc:50K-60K'] = np.where((df_inputs_prepr['annual_inc'] > 50000) & (df_inputs_prepr['annual_inc'] <= 60000), 1, 0)
df_inputs_prepr['annual_inc:60K-70K'] = np.where((df_inputs_prepr['annual_inc'] > 60000) & (df_inputs_prepr['annual_inc'] <= 70000), 1, 0)
df_inputs_prepr['annual_inc:70K-80K'] = np.where((df_inputs_prepr['annual_inc'] > 70000) & (df_inputs_prepr['annual_inc'] <= 80000), 1, 0)
df_inputs_prepr['annual_inc:80K-90K'] = np.where((df_inputs_prepr['annual_inc'] > 80000) & (df_inputs_prepr['annual_inc'] <= 90000), 1, 0)
df_inputs_prepr['annual_inc:90K-100K'] = np.where((df_inputs_prepr['annual_inc'] > 90000) & (df_inputs_prepr['annual_inc'] <= 100000), 1, 0)
df_inputs_prepr['annual_inc:100K-120K'] = np.where((df_inputs_prepr['annual_inc'] > 100000) & (df_inputs_prepr['annual_inc'] <= 120000), 1, 0)
df_inputs_prepr['annual_inc:120K-140K'] = np.where((df_inputs_prepr['annual_inc'] > 120000) & (df_inputs_prepr['annual_inc'] <= 140000), 1, 0)
df_inputs_prepr['annual_inc:>140K'] = np.where((df_inputs_prepr['annual_inc'] > 140000), 1, 0)
# mths_since_last_delinq variable
# We have to create one category for missing values and do fine and coarse classing for the rest.
df_inputs_prepr_temp = df_inputs_prepr[pd.notnull(df_inputs_prepr['mths_since_last_delinq'])]
df_inputs_prepr_temp['mths_since_last_delinq_factor'] = pd.cut(df_inputs_prepr_temp['mths_since_last_delinq'], 50)
# We calculate weight of evidence.
df_temp21 = woe_ordered_continuous(df_inputs_prepr_temp, 'mths_since_last_delinq_factor', df_targets_prepr[df_inputs_prepr_temp.index])
df_temp21
# We plot the weight of evidence values.
plot_by_woe(df_temp21, 90)
# We create the following Categories: Missing, 0-3, 4-30, 31-56, >=57
df_inputs_prepr['mths_since_last_delinq:Missing'] = np.where((df_inputs_prepr['mths_since_last_delinq'].isnull()), 1, 0)
df_inputs_prepr['mths_since_last_delinq:0-3'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 0) & (df_inputs_prepr['mths_since_last_delinq'] <= 3), 1, 0)
df_inputs_prepr['mths_since_last_delinq:4-30'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 4) & (df_inputs_prepr['mths_since_last_delinq'] <= 30), 1, 0)
df_inputs_prepr['mths_since_last_delinq:31-56'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 31) & (df_inputs_prepr['mths_since_last_delinq'] <= 56), 1, 0)
df_inputs_prepr['mths_since_last_delinq:>=57'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 57), 1, 0)
# dti variable
# Here we do fine-classing: using the 'cut' method, we split the variable into 100 categories by its values.
df_inputs_prepr['dti_factor'] = pd.cut(df_inputs_prepr['dti'], 100)
# We calculate weight of evidence.
df_temp22 = woe_ordered_continuous(df_inputs_prepr, 'dti_factor', df_targets_prepr)
df_temp22
# We plot the weight of evidence values.
plot_by_woe(df_temp22, 90)
# Similarly to income, initial examination shows that most dti values are concentrated at the low end.
# Hence, we are going to have one category for dti above 35, and we are going to apply our approach to determine
# the categories for everyone with dti of 35 or less.
df_inputs_prepr_temp = df_inputs_prepr.loc[df_inputs_prepr['dti'] <= 35, : ]
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr_temp['dti_factor'] = pd.cut(df_inputs_prepr_temp['dti'], 50)
# We calculate weight of evidence.
df_temp22 = woe_ordered_continuous(df_inputs_prepr_temp, 'dti_factor', df_targets_prepr[df_inputs_prepr_temp.index])
df_temp22
# We plot the weight of evidence values.
plot_by_woe(df_temp22, 90)
# We create the following Categories:
df_inputs_prepr['dti:<=1.4'] = np.where((df_inputs_prepr['dti'] <= 1.4), 1, 0)
df_inputs_prepr['dti:1.4-3.5'] = np.where((df_inputs_prepr['dti'] > 1.4) & (df_inputs_prepr['dti'] <= 3.5), 1, 0)
df_inputs_prepr['dti:3.5-7.7'] = np.where((df_inputs_prepr['dti'] > 3.5) & (df_inputs_prepr['dti'] <= 7.7), 1, 0)
df_inputs_prepr['dti:7.7-10.5'] = np.where((df_inputs_prepr['dti'] > 7.7) & (df_inputs_prepr['dti'] <= 10.5), 1, 0)
df_inputs_prepr['dti:10.5-16.1'] = np.where((df_inputs_prepr['dti'] > 10.5) & (df_inputs_prepr['dti'] <= 16.1), 1, 0)
df_inputs_prepr['dti:16.1-20.3'] = np.where((df_inputs_prepr['dti'] > 16.1) & (df_inputs_prepr['dti'] <= 20.3), 1, 0)
df_inputs_prepr['dti:20.3-21.7'] = np.where((df_inputs_prepr['dti'] > 20.3) & (df_inputs_prepr['dti'] <= 21.7), 1, 0)
df_inputs_prepr['dti:21.7-22.4'] = np.where((df_inputs_prepr['dti'] > 21.7) & (df_inputs_prepr['dti'] <= 22.4), 1, 0)
df_inputs_prepr['dti:22.4-35'] = np.where((df_inputs_prepr['dti'] > 22.4) & (df_inputs_prepr['dti'] <= 35), 1, 0)
df_inputs_prepr['dti:>35'] = np.where((df_inputs_prepr['dti'] > 35), 1, 0)
# mths_since_last_record variable
# We have to create one category for missing values and do fine and coarse classing for the rest.
#sum(loan_data_temp['mths_since_last_record'].isnull())
df_inputs_prepr_temp = df_inputs_prepr[pd.notnull(df_inputs_prepr['mths_since_last_record'])]
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr_temp['mths_since_last_record_factor'] = pd.cut(df_inputs_prepr_temp['mths_since_last_record'], 50)
# We calculate weight of evidence.
df_temp23 = woe_ordered_continuous(df_inputs_prepr_temp, 'mths_since_last_record_factor', df_targets_prepr[df_inputs_prepr_temp.index])
df_temp23
# We plot the weight of evidence values.
plot_by_woe(df_temp23, 90)
# We create the following Categories: 'Missing', '0-2', '3-20', '21-31', '32-80', '81-86', '>86'
df_inputs_prepr['mths_since_last_record:Missing'] = np.where((df_inputs_prepr['mths_since_last_record'].isnull()), 1, 0)
df_inputs_prepr['mths_since_last_record:0-2'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 0) & (df_inputs_prepr['mths_since_last_record'] <= 2), 1, 0)
df_inputs_prepr['mths_since_last_record:3-20'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 3) & (df_inputs_prepr['mths_since_last_record'] <= 20), 1, 0)
df_inputs_prepr['mths_since_last_record:21-31'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 21) & (df_inputs_prepr['mths_since_last_record'] <= 31), 1, 0)
df_inputs_prepr['mths_since_last_record:32-80'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 32) & (df_inputs_prepr['mths_since_last_record'] <= 80), 1, 0)
df_inputs_prepr['mths_since_last_record:81-86'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 81) & (df_inputs_prepr['mths_since_last_record'] <= 86), 1, 0)
df_inputs_prepr['mths_since_last_record:>86'] = np.where((df_inputs_prepr['mths_since_last_record'] > 86), 1, 0)
We now repeat the same preprocessing steps for the test dataset.
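Since these steps mirror the training preprocessing exactly, one refactoring option would be to wrap them in a single helper and apply it to both datasets. The function name engineer_features below is a hypothetical illustration showing only two of the engineered dummies, not the approach actually used in this notebook.
# Minimal sketch: one helper applied identically to train and test inputs.
def engineer_features(df):
    df = df.copy()
    df['term:36'] = np.where(df['term_int'] == 36, 1, 0)
    df['term:60'] = np.where(df['term_int'] == 60, 1, 0)
    return df
# loan_data_inputs_train = engineer_features(loan_data_inputs_train)
# loan_data_inputs_test = engineer_features(loan_data_inputs_test)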
df_inputs_prepr = loan_data_inputs_test
df_targets_prepr = loan_data_targets_test
# The function takes 3 arguments: a dataframe, a string, and a dataframe. The function returns a dataframe as a result.
# WoE function for discrete unordered variables
def woe_discrete(df, discrete_variabe_name, good_bad_variable_df):
df = pd.concat([df[discrete_variabe_name], good_bad_variable_df], axis = 1)
df = pd.concat([df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count(),
df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].mean()], axis = 1)
df = df.iloc[:, [0, 1, 3]]
df.columns = [df.columns.values[0], 'n_obs', 'prop_good']
df['prop_n_obs'] = df['n_obs'] / df['n_obs'].sum()
df['n_good'] = df['prop_good'] * df['n_obs']
df['n_bad'] = (1 - df['prop_good']) * df['n_obs']
df['prop_n_good'] = df['n_good'] / df['n_good'].sum()
df['prop_n_bad'] = df['n_bad'] / df['n_bad'].sum()
df['WoE'] = np.log(df['prop_n_good'] / df['prop_n_bad'])
df = df.sort_values(['WoE'])
df = df.reset_index(drop = True)
df['diff_prop_good'] = df['prop_good'].diff().abs()
df['diff_WoE'] = df['WoE'].diff().abs()
df['IV'] = (df['prop_n_good'] - df['prop_n_bad']) * df['WoE']
df['IV'] = df['IV'].sum()
return df
#grade variable
#Executing the function and storing it in a dataframe
df_temp = woe_discrete(df_inputs_prepr, 'grade', df_targets_prepr)
df_temp
#We define a function that takes 2 arguments: a dataframe and a number.
def plot_by_woe(df_WoE, rotation_of_x_axis_labels = 0):
# Turns the values of the column with index 0 to strings, makes an array from these strings, and passes it to variable x.
x = np.array(df_WoE.iloc[:, 0].apply(str))
# Selects a column with label 'WoE' and passes it to variable y.
y = df_WoE['WoE']
#Plotting the figure
plt.figure(figsize=(18, 6))
plt.plot(x, y, marker = 'o', linestyle = '--', color = 'k')
plt.xlabel(df_WoE.columns[0])
plt.ylabel('Weight of Evidence')
plt.title(str('Weight of Evidence by ' + df_WoE.columns[0]))
plt.xticks(rotation = rotation_of_x_axis_labels)
plot_by_woe(df_temp)
The better the grade, the higher the Weight of Evidence: loans with better external ratings are, on average, less risky.
#home_ownership variable
#Executing our previous WOE function
df_temp1 = woe_discrete(df_inputs_prepr, 'home_ownership', df_targets_prepr)
df_temp1
#Plotting the weight of evidence (WoE) values by executing the plot function we created previously.
plot_by_woe(df_temp1)
# The home_ownership variable has many categories.
# Therefore, we create a new discrete variable in which we combine some of them.
# 'OTHER', 'ANY' and 'NONE' are the riskiest but are very few. 'RENT' is the next riskiest.
# We combine them into one category, 'RENT_OTHER_NONE_ANY'.
# We end up with 3 categories for 'home_ownership': 'RENT_OTHER_NONE_ANY', 'OWN', 'MORTGAGE'.
df_inputs_prepr['home_ownership:RENT_OTHER_NONE_ANY'] = sum([df_inputs_prepr['home_ownership:RENT'], df_inputs_prepr['home_ownership:OTHER'],
df_inputs_prepr['home_ownership:NONE'],df_inputs_prepr['home_ownership:ANY']])
#Unique categories in the addr_state variable
df_inputs_prepr['addr_state'].unique()
#addr_state variable
# We calculate weight of evidence.
df_temp2 = woe_discrete(df_inputs_prepr, 'addr_state', df_targets_prepr)
df_temp2
# We plot the weight of evidence values.
plot_by_woe(df_temp2)
# If no loans from North Dakota appear in the data, get_dummies will not have created an 'addr_state:ND' column,
# so we add one filled with zeros to keep the set of state dummies complete.
if 'addr_state:ND' in df_inputs_prepr.columns.values:
    pass
else:
    df_inputs_prepr['addr_state:ND'] = 0
# We plot the weight of evidence values for 'addr_state' again, dropping the two states with the most extreme WoE at each end (e.g. 'IA' and 'AL').
plot_by_woe(df_temp2.iloc[2: -2, : ])
# We plot the weight of evidence values again, dropping the six most extreme states at each end (e.g. 'OR', 'NH', 'DC', 'ID', 'VT', 'ME' at the upper end).
plot_by_woe(df_temp2.iloc[6: -6, : ])
# Creating dummies for the 'addr_state' variable
# We create the following categories:
# 'ND' 'NE' 'IA' NV' 'FL' 'HI' 'AL'
# 'NM' 'VA'
# 'NY'
# 'OK' 'TN' 'MO' 'LA' 'MD' 'NC'
# 'CA'
# 'UT' 'KY' 'AZ' 'NJ'
# 'AR' 'MI' 'PA' 'OH' 'MN'
# 'RI' 'MA' 'DE' 'SD' 'IN'
# 'GA' 'WA' 'OR'
# 'WI' 'MT'
# 'TX'
# 'IL' 'CT'
# 'KS' 'SC' 'CO' 'VT' 'AK' 'MS'
# 'WV' 'NH' 'WY' 'DC' 'ME' 'ID'
# 'ND_NE_IA_NV_FL_HI_AL' will be the reference category.
df_inputs_prepr['addr_state:ND_NE_IA_NV_FL_HI_AL'] = sum([df_inputs_prepr['addr_state:ND'], df_inputs_prepr['addr_state:NE'],
df_inputs_prepr['addr_state:IA'], df_inputs_prepr['addr_state:NV'],
df_inputs_prepr['addr_state:FL'], df_inputs_prepr['addr_state:HI'],
df_inputs_prepr['addr_state:AL']])
df_inputs_prepr['addr_state:NM_VA'] = sum([df_inputs_prepr['addr_state:NM'], df_inputs_prepr['addr_state:VA']])
df_inputs_prepr['addr_state:OK_TN_MO_LA_MD_NC'] = sum([df_inputs_prepr['addr_state:OK'], df_inputs_prepr['addr_state:TN'],
df_inputs_prepr['addr_state:MO'], df_inputs_prepr['addr_state:LA'],
df_inputs_prepr['addr_state:MD'], df_inputs_prepr['addr_state:NC']])
df_inputs_prepr['addr_state:UT_KY_AZ_NJ'] = sum([df_inputs_prepr['addr_state:UT'], df_inputs_prepr['addr_state:KY'],
df_inputs_prepr['addr_state:AZ'], df_inputs_prepr['addr_state:NJ']])
df_inputs_prepr['addr_state:AR_MI_PA_OH_MN'] = sum([df_inputs_prepr['addr_state:AR'], df_inputs_prepr['addr_state:MI'],
df_inputs_prepr['addr_state:PA'], df_inputs_prepr['addr_state:OH'],
df_inputs_prepr['addr_state:MN']])
df_inputs_prepr['addr_state:RI_MA_DE_SD_IN'] = sum([df_inputs_prepr['addr_state:RI'], df_inputs_prepr['addr_state:MA'],
df_inputs_prepr['addr_state:DE'], df_inputs_prepr['addr_state:SD'],
df_inputs_prepr['addr_state:IN']])
df_inputs_prepr['addr_state:GA_WA_OR'] = sum([df_inputs_prepr['addr_state:GA'], df_inputs_prepr['addr_state:WA'],
df_inputs_prepr['addr_state:OR']])
df_inputs_prepr['addr_state:WI_MT'] = sum([df_inputs_prepr['addr_state:WI'], df_inputs_prepr['addr_state:MT']])
df_inputs_prepr['addr_state:IL_CT'] = sum([df_inputs_prepr['addr_state:IL'], df_inputs_prepr['addr_state:CT']])
df_inputs_prepr['addr_state:KS_SC_CO_VT_AK_MS'] = sum([df_inputs_prepr['addr_state:KS'], df_inputs_prepr['addr_state:SC'],
df_inputs_prepr['addr_state:CO'], df_inputs_prepr['addr_state:VT'],
df_inputs_prepr['addr_state:AK'], df_inputs_prepr['addr_state:MS']])
df_inputs_prepr['addr_state:WV_NH_WY_DC_ME_ID'] = sum([df_inputs_prepr['addr_state:WV'], df_inputs_prepr['addr_state:NH'],
df_inputs_prepr['addr_state:WY'], df_inputs_prepr['addr_state:DC'],
df_inputs_prepr['addr_state:ME'], df_inputs_prepr['addr_state:ID']])
# 'verification_status' variable
# We calculate weight of evidence.
df_temp3 = woe_discrete(df_inputs_prepr, 'verification_status', df_targets_prepr)
df_temp3
# We plot the weight of evidence values.
plot_by_woe(df_temp3)
# 'purpose' variable
# We calculate weight of evidence.
df_temp4 = woe_discrete(df_inputs_prepr, 'purpose', df_targets_prepr)
df_temp4
plot_by_woe(df_temp4, 90)
# We plot the weight of evidence values.
# We create dummy variables for the 'purpose' variable
# We combine 'educational', 'small_business', 'wedding', 'renewable_energy', 'moving', 'house' in one category: 'educ__sm_b__wedd__ren_en__mov__house'.
# We combine 'other', 'medical', 'vacation' in one category: 'oth__med__vacation'.
# We combine 'major_purchase', 'car', 'home_improvement' in one category: 'major_purch__car__home_impr'.
# We leave 'debt_consolidation' in a separate category.
# We leave 'credit_card' in a separate category.
# 'educ__sm_b__wedd__ren_en__mov__house' will be the reference category.
df_inputs_prepr['purpose:educ__sm_b__wedd__ren_en__mov__house'] = sum([df_inputs_prepr['purpose:educational'], df_inputs_prepr['purpose:small_business'],
df_inputs_prepr['purpose:wedding'], df_inputs_prepr['purpose:renewable_energy'],
df_inputs_prepr['purpose:moving'], df_inputs_prepr['purpose:house']])
df_inputs_prepr['purpose:oth__med__vacation'] = sum([df_inputs_prepr['purpose:other'], df_inputs_prepr['purpose:medical'],
df_inputs_prepr['purpose:vacation']])
df_inputs_prepr['purpose:major_purch__car__home_impr'] = sum([df_inputs_prepr['purpose:major_purchase'], df_inputs_prepr['purpose:car'],
df_inputs_prepr['purpose:home_improvement']])
# 'initial_list_status' variable
# We calculate weight of evidence.
df_temp5 = woe_discrete(df_inputs_prepr, 'initial_list_status', df_targets_prepr)
df_temp5
# We plot the weight of evidence values.
plot_by_woe(df_temp5)
# WoE function for ordered discrete and continuous variables
# The function takes 3 arguments: a dataframe, a string, and a dataframe. The function returns a dataframe as a result.
def woe_ordered_continuous(df, discrete_variabe_name, good_bad_variable_df):
df = pd.concat([df[discrete_variabe_name], good_bad_variable_df], axis = 1)
df = pd.concat([df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count(),
df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].mean()], axis = 1)
df = df.iloc[:, [0, 1, 3]]
df.columns = [df.columns.values[0], 'n_obs', 'prop_good']
df['prop_n_obs'] = df['n_obs'] / df['n_obs'].sum()
df['n_good'] = df['prop_good'] * df['n_obs']
df['n_bad'] = (1 - df['prop_good']) * df['n_obs']
df['prop_n_good'] = df['n_good'] / df['n_good'].sum()
df['prop_n_bad'] = df['n_bad'] / df['n_bad'].sum()
df['WoE'] = np.log(df['prop_n_good'] / df['prop_n_bad'])
#df = df.sort_values(['WoE'])
#df = df.reset_index(drop = True)
df['diff_prop_good'] = df['prop_good'].diff().abs()
df['diff_WoE'] = df['WoE'].diff().abs()
df['IV'] = (df['prop_n_good'] - df['prop_n_bad']) * df['WoE']
df['IV'] = df['IV'].sum()
return df
# term variable
# There are only two unique values, 36 and 60.
df_inputs_prepr['term_int'].unique()
# We calculate weight of evidence.
df_temp6 = woe_ordered_continuous(df_inputs_prepr, 'term_int', df_targets_prepr)
df_temp6
# We plot the weight of evidence values.
plot_by_woe(df_temp6)
It seems 60-month loans are much riskier than 36-month loans.
# We will keep both the 36- and 60-month categories.
# However, the '60' months category will be the reference category.
df_inputs_prepr['term:36'] = np.where((df_inputs_prepr['term_int'] == 36), 1, 0)
df_inputs_prepr['term:60'] = np.where((df_inputs_prepr['term_int'] == 60), 1, 0)
# emp_length_int variable
# Has only 11 levels: from 0 to 10. Hence, we turn it into a factor with 11 levels.
df_inputs_prepr['emp_length_int'].unique()
# We calculate weight of evidence.
df_temp7 = woe_ordered_continuous(df_inputs_prepr, 'emp_length_int', df_targets_prepr)
df_temp7
# We plot the weight of evidence values.
plot_by_woe(df_temp7)
# Employment length has several categories
# So we have to create the following new categories for emp_length_int: '0', '1', '2 - 4', '5 - 6', '7 - 9', '10'
# '0' will be the reference category
df_inputs_prepr['emp_length:0'] = np.where(df_inputs_prepr['emp_length_int'].isin([0]), 1, 0)
df_inputs_prepr['emp_length:1'] = np.where(df_inputs_prepr['emp_length_int'].isin([1]), 1, 0)
df_inputs_prepr['emp_length:2-4'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(2, 5)), 1, 0)
df_inputs_prepr['emp_length:5-6'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(5, 7)), 1, 0)
df_inputs_prepr['emp_length:7-9'] = np.where(df_inputs_prepr['emp_length_int'].isin(range(7, 10)), 1, 0)
df_inputs_prepr['emp_length:10'] = np.where(df_inputs_prepr['emp_length_int'].isin([10]), 1, 0)
# Months since loan issue date (mths_since_issue_d) variable
df_inputs_prepr['mths_since_issue_d'].unique()
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr['mths_since_issue_d_factor'] = pd.cut(df_inputs_prepr['mths_since_issue_d'], 50)
df_inputs_prepr['mths_since_issue_d_factor']
# mths_since_issue_d
# We calculate weight of evidence.
df_temp8 = woe_ordered_continuous(df_inputs_prepr, 'mths_since_issue_d_factor', df_targets_prepr)
df_temp8
# We plot the weight of evidence values.
plot_by_woe(df_temp8)
We have to rotate the labels because we cannot read them otherwise.
# We plot the weight of evidence values, rotating the labels 90 degrees.
plot_by_woe(df_temp8, 90)
# We plot the weight of evidence values.
plot_by_woe(df_temp8.iloc[3: , : ], 90)
# We create the following categories:
# < 38, 38 - 39, 40 - 41, 42 - 48, 49 - 52, 53 - 64, 65 - 84, > 84.
df_inputs_prepr['mths_since_issue_d:<38'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(38)), 1, 0)
df_inputs_prepr['mths_since_issue_d:38-39'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(38, 40)), 1, 0)
df_inputs_prepr['mths_since_issue_d:40-41'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(40, 42)), 1, 0)
df_inputs_prepr['mths_since_issue_d:42-48'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(42, 49)), 1, 0)
df_inputs_prepr['mths_since_issue_d:49-52'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(49, 53)), 1, 0)
df_inputs_prepr['mths_since_issue_d:53-64'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(53, 65)), 1, 0)
df_inputs_prepr['mths_since_issue_d:65-84'] = np.where(df_inputs_prepr['mths_since_issue_d'].isin(range(65, 85)), 1, 0)
df_inputs_prepr['mths_since_issue_d:>84'] = np.where((df_inputs_prepr['mths_since_issue_d'] > 84), 1, 0)
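Since these dummies are meant to be mutually exclusive and exhaustive, a quick sanity check (a hypothetical sketch, assuming mths_since_issue_d has no missing values) is that they sum to 1 for every row:
# Hypothetical sanity-check sketch: each loan should fall into exactly one mths_since_issue_d category.
issue_d_dummies = [col for col in df_inputs_prepr.columns if col.startswith('mths_since_issue_d:')]
df_inputs_prepr[issue_d_dummies].sum(axis = 1).value_counts()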
# int_rate variable
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr['int_rate_factor'] = pd.cut(df_inputs_prepr['int_rate'], 50)
# We calculate weight of evidence.
df_temp9 = woe_ordered_continuous(df_inputs_prepr, 'int_rate_factor', df_targets_prepr)
df_temp9
# We plot the weight of evidence values.
plot_by_woe(df_temp9, 90)
# We create the following categories:
# '< 9.548', '9.548 - 12.025', '12.025 - 15.74', '15.74 - 20.281', '> 20.281'
df_inputs_prepr['int_rate:<9.548'] = np.where((df_inputs_prepr['int_rate'] <= 9.548), 1, 0)
df_inputs_prepr['int_rate:9.548-12.025'] = np.where((df_inputs_prepr['int_rate'] > 9.548) & (df_inputs_prepr['int_rate'] <= 12.025), 1, 0)
df_inputs_prepr['int_rate:12.025-15.74'] = np.where((df_inputs_prepr['int_rate'] > 12.025) & (df_inputs_prepr['int_rate'] <= 15.74), 1, 0)
df_inputs_prepr['int_rate:15.74-20.281'] = np.where((df_inputs_prepr['int_rate'] > 15.74) & (df_inputs_prepr['int_rate'] <= 20.281), 1, 0)
df_inputs_prepr['int_rate:>20.281'] = np.where((df_inputs_prepr['int_rate'] > 20.281), 1, 0)
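An equivalent way to build these interest-rate classes (a hypothetical alternative sketch, not the approach used in the rest of this notebook) is pd.cut with explicit bin edges followed by pd.get_dummies:
# Hypothetical alternative sketch: the same coarse classes via pd.cut with explicit, right-closed bin edges.
int_rate_bins = pd.cut(df_inputs_prepr['int_rate'],
                       bins = [-np.inf, 9.548, 12.025, 15.74, 20.281, np.inf],
                       labels = ['<9.548', '9.548-12.025', '12.025-15.74', '15.74-20.281', '>20.281'])
pd.get_dummies(int_rate_bins, prefix = 'int_rate', prefix_sep = ':').head()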
# funded_amnt variable
df_inputs_prepr['funded_amnt'].unique()
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr['funded_amnt_factor'] = pd.cut(df_inputs_prepr['funded_amnt'], 50)
# We calculate weight of evidence.
df_temp10 = woe_ordered_continuous(df_inputs_prepr, 'funded_amnt_factor', df_targets_prepr)
df_temp10
# We plot the weight of evidence values.
plot_by_woe(df_temp10, 90)
# mths_since_earliest_cr_line variable
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr['mths_since_earliest_cr_line_factor'] = pd.cut(df_inputs_prepr['mths_since_earliest_cr_line'], 50)
# We calculate weight of evidence.
df_temp11 = woe_ordered_continuous(df_inputs_prepr, 'mths_since_earliest_cr_line_factor', df_targets_prepr)
df_temp11
# We plot the weight of evidence values.
plot_by_woe(df_temp11, 90)
# We plot the weight of evidence values again, this time excluding the first six categories.
plot_by_woe(df_temp11.iloc[6: , : ], 90)
# We create the following categories:
# < 140, 140 - 164, 165 - 247, 248 - 270, 271 - 352, > 352
df_inputs_prepr['mths_since_earliest_cr_line:<140'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(140)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:141-164'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(140, 165)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:165-247'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(165, 248)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:248-270'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(248, 271)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:271-352'] = np.where(df_inputs_prepr['mths_since_earliest_cr_line'].isin(range(271, 353)), 1, 0)
df_inputs_prepr['mths_since_earliest_cr_line:>352'] = np.where((df_inputs_prepr['mths_since_earliest_cr_line'] > 352), 1, 0)
# delinq_2yrs variable
# We calculate weight of evidence
df_temp12 = woe_ordered_continuous(df_inputs_prepr, 'delinq_2yrs', df_targets_prepr)
df_temp12
# We plot the weight of evidence values
plot_by_woe(df_temp12)
# We create the following Categories: 0, 1-3, >=4
df_inputs_prepr['delinq_2yrs:0'] = np.where((df_inputs_prepr['delinq_2yrs'] == 0), 1, 0)
df_inputs_prepr['delinq_2yrs:1-3'] = np.where((df_inputs_prepr['delinq_2yrs'] >= 1) & (df_inputs_prepr['delinq_2yrs'] <= 3), 1, 0)
df_inputs_prepr['delinq_2yrs:>=4'] = np.where((df_inputs_prepr['delinq_2yrs'] >= 4), 1, 0)
# inq_last_6mths variable
# We calculate weight of evidence.
df_temp13 = woe_ordered_continuous(df_inputs_prepr, 'inq_last_6mths', df_targets_prepr)
df_temp13
# We plot the weight of evidence values
plot_by_woe(df_temp13)
# We create the following Categories: 0, 1 - 2, 3 - 6, > 6
df_inputs_prepr['inq_last_6mths:0'] = np.where((df_inputs_prepr['inq_last_6mths'] == 0), 1, 0)
df_inputs_prepr['inq_last_6mths:1-2'] = np.where((df_inputs_prepr['inq_last_6mths'] >= 1) & (df_inputs_prepr['inq_last_6mths'] <= 2), 1, 0)
df_inputs_prepr['inq_last_6mths:3-6'] = np.where((df_inputs_prepr['inq_last_6mths'] >= 3) & (df_inputs_prepr['inq_last_6mths'] <= 6), 1, 0)
df_inputs_prepr['inq_last_6mths:>6'] = np.where((df_inputs_prepr['inq_last_6mths'] > 6), 1, 0)
# open_acc variable
# We calculate weight of evidence.
df_temp14 = woe_ordered_continuous(df_inputs_prepr, 'open_acc', df_targets_prepr)
df_temp14
# We plot the weight of evidence values
plot_by_woe(df_temp14, 90)
# We plot the weight of evidence values for the first 40 categories.
plot_by_woe(df_temp14.iloc[ : 40, :], 90)
# We create the following Categories: '0', '1-3', '4-12', '13-17', '18-22', '23-25', '26-30', '>30'
df_inputs_prepr['open_acc:0'] = np.where((df_inputs_prepr['open_acc'] == 0), 1, 0)
df_inputs_prepr['open_acc:1-3'] = np.where((df_inputs_prepr['open_acc'] >= 1) & (df_inputs_prepr['open_acc'] <= 3), 1, 0)
df_inputs_prepr['open_acc:4-12'] = np.where((df_inputs_prepr['open_acc'] >= 4) & (df_inputs_prepr['open_acc'] <= 12), 1, 0)
df_inputs_prepr['open_acc:13-17'] = np.where((df_inputs_prepr['open_acc'] >= 13) & (df_inputs_prepr['open_acc'] <= 17), 1, 0)
df_inputs_prepr['open_acc:18-22'] = np.where((df_inputs_prepr['open_acc'] >= 18) & (df_inputs_prepr['open_acc'] <= 22), 1, 0)
df_inputs_prepr['open_acc:23-25'] = np.where((df_inputs_prepr['open_acc'] >= 23) & (df_inputs_prepr['open_acc'] <= 25), 1, 0)
df_inputs_prepr['open_acc:26-30'] = np.where((df_inputs_prepr['open_acc'] >= 26) & (df_inputs_prepr['open_acc'] <= 30), 1, 0)
df_inputs_prepr['open_acc:>=31'] = np.where((df_inputs_prepr['open_acc'] >= 31), 1, 0)
# pub_rec variable
# We calculate weight of evidence
df_temp15 = woe_ordered_continuous(df_inputs_prepr, 'pub_rec', df_targets_prepr)
df_temp15
# We plot the weight of evidence values.
plot_by_woe(df_temp15, 90)
# We create the following Categories: '0-2', '3-4', '>=5'
df_inputs_prepr['pub_rec:0-2'] = np.where((df_inputs_prepr['pub_rec'] >= 0) & (df_inputs_prepr['pub_rec'] <= 2), 1, 0)
df_inputs_prepr['pub_rec:3-4'] = np.where((df_inputs_prepr['pub_rec'] >= 3) & (df_inputs_prepr['pub_rec'] <= 4), 1, 0)
df_inputs_prepr['pub_rec:>=5'] = np.where((df_inputs_prepr['pub_rec'] >= 5), 1, 0)
# total_acc variable
df_inputs_prepr['total_acc'].unique()
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr['total_acc_factor'] = pd.cut(df_inputs_prepr['total_acc'], 50)
# We calculate weight of evidence
df_temp16 = woe_ordered_continuous(df_inputs_prepr, 'total_acc_factor', df_targets_prepr)
df_temp16
# We plot the weight of evidence values
plot_by_woe(df_temp16, 90)
# We create the following Categories: '<=27', '28-51', '>51'
df_inputs_prepr['total_acc:<=27'] = np.where((df_inputs_prepr['total_acc'] <= 27), 1, 0)
df_inputs_prepr['total_acc:28-51'] = np.where((df_inputs_prepr['total_acc'] >= 28) & (df_inputs_prepr['total_acc'] <= 51), 1, 0)
df_inputs_prepr['total_acc:>=52'] = np.where((df_inputs_prepr['total_acc'] >= 52), 1, 0)
# acc_now_delinq variable
# We calculate weight of evidence.
df_temp17 = woe_ordered_continuous(df_inputs_prepr, 'acc_now_delinq', df_targets_prepr)
df_temp17
# We plot the weight of evidence values
plot_by_woe(df_temp17)
# We create the following Categories: '0', '>=1'
df_inputs_prepr['acc_now_delinq:0'] = np.where((df_inputs_prepr['acc_now_delinq'] == 0), 1, 0)
df_inputs_prepr['acc_now_delinq:>=1'] = np.where((df_inputs_prepr['acc_now_delinq'] >= 1), 1, 0)
# total_rev_hi_lim variable
df_inputs_prepr['total_rev_hi_lim'].unique()
# Here we do fine-classing: using the 'cut' method, we split the variable into 2000 categories by its values.
df_inputs_prepr['total_rev_hi_lim_factor'] = pd.cut(df_inputs_prepr['total_rev_hi_lim'], 2000)
# We calculate weight of evidence.
df_temp18 = woe_ordered_continuous(df_inputs_prepr, 'total_rev_hi_lim_factor', df_targets_prepr)
df_temp18
# We plot the weight of evidence values for the first 50 fine classes.
plot_by_woe(df_temp18.iloc[: 50, : ], 90)
# We create the following Categories:'<=5K', '5K-10K', '10K-20K', '20K-30K', '30K-40K', '40K-55K', '55K-95K', '>95K'
df_inputs_prepr['total_rev_hi_lim:<=5K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] <= 5000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:5K-10K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 5000) & (df_inputs_prepr['total_rev_hi_lim'] <= 10000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:10K-20K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 10000) & (df_inputs_prepr['total_rev_hi_lim'] <= 20000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:20K-30K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 20000) & (df_inputs_prepr['total_rev_hi_lim'] <= 30000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:30K-40K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 30000) & (df_inputs_prepr['total_rev_hi_lim'] <= 40000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:40K-55K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 40000) & (df_inputs_prepr['total_rev_hi_lim'] <= 55000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:55K-95K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 55000) & (df_inputs_prepr['total_rev_hi_lim'] <= 95000), 1, 0)
df_inputs_prepr['total_rev_hi_lim:>95K'] = np.where((df_inputs_prepr['total_rev_hi_lim'] > 95000), 1, 0)
# installment variable
df_inputs_prepr['installment'].unique()
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr['installment_factor'] = pd.cut(df_inputs_prepr['installment'], 50)
# We calculate weight of evidence.
df_temp19 = woe_ordered_continuous(df_inputs_prepr, 'installment_factor', df_targets_prepr)
df_temp19
# We plot the weight of evidence values.
plot_by_woe(df_temp19, 90)
# annual_inc variable
df_inputs_prepr['annual_inc'].unique()
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 50)
# We calculate weight of evidence.
df_temp20 = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_targets_prepr)
df_temp20
# Splitting the initial ‘annual income’ variable into 50 categories doesn't work well for fine classing because there are a lot of people with low income and very few people with high income.
# Thus, we repeat the fine-classing with the 'cut' method, this time splitting the variable into 100 categories by its values.
df_inputs_prepr['annual_inc_factor'] = pd.cut(df_inputs_prepr['annual_inc'], 100)
# We calculate weight of evidence.
df_temp20 = woe_ordered_continuous(df_inputs_prepr, 'annual_inc_factor', df_targets_prepr)
df_temp20
# Initial examination shows that there are very few individuals with large incomes and that most incomes are small.
# Hence, we keep a single category for annual incomes above 140K and apply our fine- and coarse-classing approach
# to determine the categories for everyone earning 140K or less.
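Before fixing the cutoff, a quick look at the data (a hypothetical check sketch, not part of the original notebook) shows what share of borrowers falls above 140K:
# Hypothetical check sketch: share of borrowers with annual income above the 140K cutoff.
(df_inputs_prepr['annual_inc'] > 140000).mean()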
df_inputs_prepr_temp = df_inputs_prepr.loc[df_inputs_prepr['annual_inc'] <= 140000, : ]
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr_temp["annual_inc_factor"] = pd.cut(df_inputs_prepr_temp['annual_inc'], 50)
# We calculate weight of evidence.
df_temp20 = woe_ordered_continuous(df_inputs_prepr_temp, 'annual_inc_factor', df_targets_prepr[df_inputs_prepr_temp.index])
df_temp20
# We plot the weight of evidence values.
plot_by_woe(df_temp20, 90)
# WoE varies roughly monotonically with income, so we coarse-class income into the 12 categories below:
# mostly 10K-wide bands, with wider bands at both ends and an open-ended '>140K' category.
df_inputs_prepr['annual_inc:<20K'] = np.where((df_inputs_prepr['annual_inc'] <= 20000), 1, 0)
df_inputs_prepr['annual_inc:20K-30K'] = np.where((df_inputs_prepr['annual_inc'] > 20000) & (df_inputs_prepr['annual_inc'] <= 30000), 1, 0)
df_inputs_prepr['annual_inc:30K-40K'] = np.where((df_inputs_prepr['annual_inc'] > 30000) & (df_inputs_prepr['annual_inc'] <= 40000), 1, 0)
df_inputs_prepr['annual_inc:40K-50K'] = np.where((df_inputs_prepr['annual_inc'] > 40000) & (df_inputs_prepr['annual_inc'] <= 50000), 1, 0)
df_inputs_prepr['annual_inc:50K-60K'] = np.where((df_inputs_prepr['annual_inc'] > 50000) & (df_inputs_prepr['annual_inc'] <= 60000), 1, 0)
df_inputs_prepr['annual_inc:60K-70K'] = np.where((df_inputs_prepr['annual_inc'] > 60000) & (df_inputs_prepr['annual_inc'] <= 70000), 1, 0)
df_inputs_prepr['annual_inc:70K-80K'] = np.where((df_inputs_prepr['annual_inc'] > 70000) & (df_inputs_prepr['annual_inc'] <= 80000), 1, 0)
df_inputs_prepr['annual_inc:80K-90K'] = np.where((df_inputs_prepr['annual_inc'] > 80000) & (df_inputs_prepr['annual_inc'] <= 90000), 1, 0)
df_inputs_prepr['annual_inc:90K-100K'] = np.where((df_inputs_prepr['annual_inc'] > 90000) & (df_inputs_prepr['annual_inc'] <= 100000), 1, 0)
df_inputs_prepr['annual_inc:100K-120K'] = np.where((df_inputs_prepr['annual_inc'] > 100000) & (df_inputs_prepr['annual_inc'] <= 120000), 1, 0)
df_inputs_prepr['annual_inc:120K-140K'] = np.where((df_inputs_prepr['annual_inc'] > 120000) & (df_inputs_prepr['annual_inc'] <= 140000), 1, 0)
df_inputs_prepr['annual_inc:>140K'] = np.where((df_inputs_prepr['annual_inc'] > 140000), 1, 0)
# mths_since_last_delinq variable
# We have to create one category for missing values and do fine and coarse classing for the rest.
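First, a quick look at how many applications actually have a missing value here (a hypothetical check sketch); these are the rows that the 'Missing' dummy below will capture:
# Hypothetical check sketch: count of applications with no recorded months since last delinquency.
df_inputs_prepr['mths_since_last_delinq'].isnull().sum()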
df_inputs_prepr_temp = df_inputs_prepr[pd.notnull(df_inputs_prepr['mths_since_last_delinq'])]
df_inputs_prepr_temp['mths_since_last_delinq_factor'] = pd.cut(df_inputs_prepr_temp['mths_since_last_delinq'], 50)
# We calculate weight of evidence.
df_temp21 = woe_ordered_continuous(df_inputs_prepr_temp, 'mths_since_last_delinq_factor', df_targets_prepr[df_inputs_prepr_temp.index])
df_temp21
# We plot the weight of evidence values.
plot_by_woe(df_temp21, 90)
# We create the following Categories: Missing, 0-3, 4-30, 31-56, >=57
df_inputs_prepr['mths_since_last_delinq:Missing'] = np.where((df_inputs_prepr['mths_since_last_delinq'].isnull()), 1, 0)
df_inputs_prepr['mths_since_last_delinq:0-3'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 0) & (df_inputs_prepr['mths_since_last_delinq'] <= 3), 1, 0)
df_inputs_prepr['mths_since_last_delinq:4-30'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 4) & (df_inputs_prepr['mths_since_last_delinq'] <= 30), 1, 0)
df_inputs_prepr['mths_since_last_delinq:31-56'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 31) & (df_inputs_prepr['mths_since_last_delinq'] <= 56), 1, 0)
df_inputs_prepr['mths_since_last_delinq:>=57'] = np.where((df_inputs_prepr['mths_since_last_delinq'] >= 57), 1, 0)
# dti variable
# Here we do fine-classing: using the 'cut' method, we split the variable into 100 categories by its values.
df_inputs_prepr['dti_factor'] = pd.cut(df_inputs_prepr['dti'], 100)
# We calculate weight of evidence.
df_temp22 = woe_ordered_continuous(df_inputs_prepr, 'dti_factor', df_targets_prepr)
df_temp22
# We plot the weight of evidence values.
plot_by_woe(df_temp22, 90)
# Similarly to income, initial examination shows that the distribution is concentrated at lower DTI values.
# Hence, we keep a single category for DTI above 35 and apply our approach to determine
# the categories for everyone with a DTI of 35 or less.
df_inputs_prepr_temp = df_inputs_prepr.loc[df_inputs_prepr['dti'] <= 35, : ]
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr_temp['dti_factor'] = pd.cut(df_inputs_prepr_temp['dti'], 50)
# We calculate weight of evidence.
df_temp22 = woe_ordered_continuous(df_inputs_prepr_temp, 'dti_factor', df_targets_prepr[df_inputs_prepr_temp.index])
df_temp22
# We plot the weight of evidence values.
plot_by_woe(df_temp22, 90)
# We create the following categories: '<=1.4', '1.4-3.5', '3.5-7.7', '7.7-10.5', '10.5-16.1', '16.1-20.3', '20.3-21.7', '21.7-22.4', '22.4-35', '>35'
df_inputs_prepr['dti:<=1.4'] = np.where((df_inputs_prepr['dti'] <= 1.4), 1, 0)
df_inputs_prepr['dti:1.4-3.5'] = np.where((df_inputs_prepr['dti'] > 1.4) & (df_inputs_prepr['dti'] <= 3.5), 1, 0)
df_inputs_prepr['dti:3.5-7.7'] = np.where((df_inputs_prepr['dti'] > 3.5) & (df_inputs_prepr['dti'] <= 7.7), 1, 0)
df_inputs_prepr['dti:7.7-10.5'] = np.where((df_inputs_prepr['dti'] > 7.7) & (df_inputs_prepr['dti'] <= 10.5), 1, 0)
df_inputs_prepr['dti:10.5-16.1'] = np.where((df_inputs_prepr['dti'] > 10.5) & (df_inputs_prepr['dti'] <= 16.1), 1, 0)
df_inputs_prepr['dti:16.1-20.3'] = np.where((df_inputs_prepr['dti'] > 16.1) & (df_inputs_prepr['dti'] <= 20.3), 1, 0)
df_inputs_prepr['dti:20.3-21.7'] = np.where((df_inputs_prepr['dti'] > 20.3) & (df_inputs_prepr['dti'] <= 21.7), 1, 0)
df_inputs_prepr['dti:21.7-22.4'] = np.where((df_inputs_prepr['dti'] > 21.7) & (df_inputs_prepr['dti'] <= 22.4), 1, 0)
df_inputs_prepr['dti:22.4-35'] = np.where((df_inputs_prepr['dti'] > 22.4) & (df_inputs_prepr['dti'] <= 35), 1, 0)
df_inputs_prepr['dti:>35'] = np.where((df_inputs_prepr['dti'] > 35), 1, 0)
# mths_since_last_record variable
# We have to create one category for missing values and do fine and coarse classing for the rest.
#sum(loan_data_temp['mths_since_last_record'].isnull())
df_inputs_prepr_temp = df_inputs_prepr[pd.notnull(df_inputs_prepr['mths_since_last_record'])]
# Here we do fine-classing: using the 'cut' method, we split the variable into 50 categories by its values.
df_inputs_prepr_temp['mths_since_last_record_factor'] = pd.cut(df_inputs_prepr_temp['mths_since_last_record'], 50)
# We calculate weight of evidence.
df_temp23 = woe_ordered_continuous(df_inputs_prepr_temp, 'mths_since_last_record_factor', df_targets_prepr[df_inputs_prepr_temp.index])
df_temp23
# We plot the weight of evidence values.
plot_by_woe(df_temp23, 90)
# We create the following Categories: 'Missing', '0-2', '3-20', '21-31', '32-80', '81-86', '>86'
df_inputs_prepr['mths_since_last_record:Missing'] = np.where((df_inputs_prepr['mths_since_last_record'].isnull()), 1, 0)
df_inputs_prepr['mths_since_last_record:0-2'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 0) & (df_inputs_prepr['mths_since_last_record'] <= 2), 1, 0)
df_inputs_prepr['mths_since_last_record:3-20'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 3) & (df_inputs_prepr['mths_since_last_record'] <= 20), 1, 0)
df_inputs_prepr['mths_since_last_record:21-31'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 21) & (df_inputs_prepr['mths_since_last_record'] <= 31), 1, 0)
df_inputs_prepr['mths_since_last_record:32-80'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 32) & (df_inputs_prepr['mths_since_last_record'] <= 80), 1, 0)
df_inputs_prepr['mths_since_last_record:81-86'] = np.where((df_inputs_prepr['mths_since_last_record'] >= 81) & (df_inputs_prepr['mths_since_last_record'] <= 86), 1, 0)
df_inputs_prepr['mths_since_last_record:>86'] = np.where((df_inputs_prepr['mths_since_last_record'] > 86), 1, 0)
path = os.getcwd()
path
# List the contents of the current working directory (Jupyter shell command).
ls
# Making a 'data' folder; exist_ok avoids an error if the folder already exists.
os.makedirs('data', exist_ok=True)
loan_data_inputs_test = df_inputs_prepr
# Exporting the preprocessed train and test datasets as CSV files
loan_data_inputs_train.to_csv(os.getcwd() + r'\data\loan_data_inputs_train.csv')
loan_data_targets_train.to_csv(os.getcwd() + r'\data\loan_data_targets_train.csv')
loan_data_inputs_test.to_csv(os.getcwd() + r'\data\loan_data_inputs_test.csv')
loan_data_targets_test.to_csv(os.getcwd() + r'\data\loan_data_targets_test.csv')
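The CSVs above are written together with the DataFrame index, so a minimal read-back sketch (hypothetical, just to confirm the round trip) would reload them with index_col = 0:
# Hypothetical read-back sketch: reload one of the exported files and confirm its shape.
check = pd.read_csv(os.getcwd() + r'\data\loan_data_inputs_train.csv', index_col = 0)
check.shape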